Merge pull request #234 from afro88/WebsiteAgentURLArrays

Website agent url arrays

Andrew Cantino преди 11 години
родител
ревизия
f658dd884c
променени са 2 файла, в които са добавени 98 реда и са изтрити 60 реда
  1. 74 60
      app/models/agents/website_agent.rb
  2. 24 0
      spec/models/agents/website_agent_spec.rb

+ 74 - 60
app/models/agents/website_agent.rb

@@ -16,6 +16,8 @@ module Agents
16 16
 
17 17
       Specify a `url` and select a `mode` for when to create Events based on the scraped data, either `all` or `on_change`.
18 18
 
19
+      `url` can be a single URL, or an array of URLs (for example, multiple pages with the exact same structure but different content to scrape)
20
+
19 21
       The `type` value can be `xml`, `html`, or `json`.
20 22
 
21 23
       To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes.
@@ -107,85 +109,97 @@ module Agents
107 109
       log "Fetching #{options['url']}"
108 110
       request_opts = { :followlocation => true }
109 111
       request_opts[:userpwd] = options['basic_auth'] if options['basic_auth'].present?
110
-      request = Typhoeus::Request.new(options['url'], request_opts)
111 112
 
112
-      request.on_failure do |response|
113
-        error "Failed: #{response.inspect}"
113
+      requests = []
114
+
115
+      if options['url'].kind_of?(Array)
116
+        options['url'].each do |url|
117
+           requests.push(Typhoeus::Request.new(url, request_opts))
118
+        end
119
+      else
120
+        requests.push(Typhoeus::Request.new(options['url'], request_opts))
114 121
       end
115 122
 
116
-      request.on_success do |response|
117
-        body = response.body
118
-        if (encoding = options['force_encoding']).present?
119
-          body = body.encode(Encoding::UTF_8, encoding)
123
+      requests.each do |request|
124
+        request.on_failure do |response|
125
+          error "Failed: #{response.inspect}"
120 126
         end
121
-        doc = parse(body)
122 127
 
123
-        if extract_full_json?
124
-          if store_payload!(previous_payloads(1), doc)
125
-            log "Storing new result for '#{name}': #{doc.inspect}"
126
-            create_event :payload => doc
128
+        request.on_success do |response|
129
+          body = response.body
130
+          if (encoding = options['force_encoding']).present?
131
+            body = body.encode(Encoding::UTF_8, encoding)
127 132
           end
128
-        else
129
-          output = {}
130
-          options['extract'].each do |name, extraction_details|
131
-            if extraction_type == "json"
132
-              result = Utils.values_at(doc, extraction_details['path'])
133
-              log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
134
-            else
135
-              case
136
-              when css = extraction_details['css']
137
-                nodes = doc.css(css)
138
-              when xpath = extraction_details['xpath']
139
-                nodes = doc.xpath(xpath)
133
+          doc = parse(body)
134
+
135
+          if extract_full_json?
136
+            if store_payload!(previous_payloads(1), doc)
137
+              log "Storing new result for '#{name}': #{doc.inspect}"
138
+              create_event :payload => doc
139
+            end
140
+          else
141
+            output = {}
142
+            options['extract'].each do |name, extraction_details|
143
+              if extraction_type == "json"
144
+                result = Utils.values_at(doc, extraction_details['path'])
145
+                log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
140 146
               else
141
-                error "'css' or 'xpath' is required for HTML or XML extraction"
142
-                return
143
-              end
144
-              unless Nokogiri::XML::NodeSet === nodes
145
-                error "The result of HTML/XML extraction was not a NodeSet"
146
-                return
147
-              end
148
-              result = nodes.map { |node|
149
-                if extraction_details['attr']
150
-                  node.attr(extraction_details['attr'])
151
-                elsif extraction_details['text']
152
-                  node.text()
147
+                case
148
+                when css = extraction_details['css']
149
+                  nodes = doc.css(css)
150
+                when xpath = extraction_details['xpath']
151
+                  nodes = doc.xpath(xpath)
153 152
                 else
154
-                  error "'attr' or 'text' is required on HTML or XML extraction patterns"
153
+                  error "'css' or 'xpath' is required for HTML or XML extraction"
155 154
                   return
156 155
                 end
157
-              }
158
-              log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
156
+                unless Nokogiri::XML::NodeSet === nodes
157
+                  error "The result of HTML/XML extraction was not a NodeSet"
158
+                  return
159
+                end
160
+                result = nodes.map { |node|
161
+                  if extraction_details['attr']
162
+                    node.attr(extraction_details['attr'])
163
+                  elsif extraction_details['text']
164
+                    node.text()
165
+                  else
166
+                    error "'attr' or 'text' is required on HTML or XML extraction patterns"
167
+                    return
168
+                  end
169
+                }
170
+                log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
171
+              end
172
+              output[name] = result
159 173
             end
160
-            output[name] = result
161
-          end
162 174
 
163
-          num_unique_lengths = options['extract'].keys.map { |name| output[name].length }.uniq
175
+            num_unique_lengths = options['extract'].keys.map { |name| output[name].length }.uniq
164 176
 
165
-          if num_unique_lengths.length != 1
166
-            error "Got an uneven number of matches for #{options['name']}: #{options['extract'].inspect}"
167
-            return
168
-          end
169
-      
170
-          old_events = previous_payloads num_unique_lengths.first
171
-          num_unique_lengths.first.times do |index|
172
-            result = {}
173
-            options['extract'].keys.each do |name|
174
-              result[name] = output[name][index]
175
-              if name.to_s == 'url'
176
-                result[name] = URI.join(options['url'], result[name]).to_s if (result[name] =~ URI::DEFAULT_PARSER.regexp[:ABS_URI]).nil?
177
-              end
177
+            if num_unique_lengths.length != 1
178
+              error "Got an uneven number of matches for #{options['name']}: #{options['extract'].inspect}"
179
+              return
178 180
             end
181
+        
182
+            old_events = previous_payloads num_unique_lengths.first
183
+            num_unique_lengths.first.times do |index|
184
+              result = {}
185
+              options['extract'].keys.each do |name|
186
+                result[name] = output[name][index]
187
+                if name.to_s == 'url'
188
+                  result[name] = URI.join(options['url'], result[name]).to_s if (result[name] =~ URI::DEFAULT_PARSER.regexp[:ABS_URI]).nil?
189
+                end
190
+              end
179 191
 
180
-            if store_payload!(old_events, result)
181
-              log "Storing new parsed result for '#{name}': #{result.inspect}"
182
-              create_event :payload => result
192
+              if store_payload!(old_events, result)
193
+                log "Storing new parsed result for '#{name}': #{result.inspect}"
194
+                create_event :payload => result
195
+              end
183 196
             end
184 197
           end
185 198
         end
199
+
200
+        hydra.queue request
201
+        hydra.run
186 202
       end
187
-      hydra.queue request
188
-      hydra.run
189 203
     end
190 204
 
191 205
     private

+ 24 - 0
spec/models/agents/website_agent_spec.rb

@@ -91,6 +91,30 @@ describe Agents::WebsiteAgent do
91 91
         @checker.check
92 92
         @checker.logs.first.message.should =~ /Got an uneven number of matches/
93 93
       end
94
+
95
+      it "should accept an array for url" do
96
+        @site['url'] = ["http://xkcd.com/1/", "http://xkcd.com/2/"]
97
+        @checker.options = @site
98
+        lambda { @checker.save! }.should_not raise_error;
99
+        lambda { @checker.check }.should_not raise_error;
100
+      end
101
+
102
+      it "should parse events from all urls in array" do
103
+        lambda {
104
+          @site['url'] = ["http://xkcd.com/", "http://xkcd.com/"]
105
+          @site['mode'] = 'all'
106
+          @checker.options = @site
107
+          @checker.check
108
+        }.should change { Event.count }.by(2)
109
+      end
110
+
111
+      it "should follow unique rules when parsing array of urls" do
112
+        lambda {
113
+          @site['url'] = ["http://xkcd.com/", "http://xkcd.com/"]
114
+          @checker.options = @site
115
+          @checker.check
116
+        }.should change { Event.count }.by(1)
117
+      end
94 118
     end
95 119
 
96 120
     describe 'encoding' do